Read the data

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

data.set <- read.table("dataLA.prn", header=TRUE, na.strings="*")

KeyStroke Distributions

p.10 <- ggplot(data=data.set, aes(keystroke10, colour=name)) + 
  geom_histogram(fill="white")
p.10
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.20 <- ggplot(data=data.set, aes(keystroke20, colour=name)) + 
  geom_histogram(fill="white")
p.20
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.30 <- ggplot(data=data.set, aes(keystroke30, colour=name)) + 
  geom_histogram(fill="white")
p.30
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.40 <- ggplot(data=data.set, aes(keystroke40, colour=name)) + 
  geom_histogram(fill="white")
p.40
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.50 <- ggplot(data=data.set, aes(keystroke50, colour=name)) + 
  geom_histogram(fill="white")
p.50
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.60 <- ggplot(data=data.set, aes(keystroke60, colour=name)) + 
  geom_histogram(fill="white")
p.60
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.70 <- ggplot(data=data.set, aes(keystroke70, colour=name)) + 
  geom_histogram(fill="white")
p.70
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.80 <- ggplot(data=data.set, aes(keystroke80, colour=name)) + 
  geom_histogram(fill="white")
p.80
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.90 <- ggplot(data=data.set, aes(keystroke90, colour=name)) + 
  geom_histogram(fill="white")
p.90
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.100 <- ggplot(data=data.set, aes(keystroke100, colour=name)) + 
  geom_histogram(fill="white")
p.100
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Correlation matrix

require(corrplot)
## Loading required package: corrplot
## corrplot 0.84 loaded
library(corrplot)
data.set$essay_typebin <- ifelse(data.set$essay_type == "True", 1, 0)
cor_data <- data.set[, 3:21]
corr_matrix <- cor(cor_data)
#simple correlation matrix
round(corr_matrix, 2)
##              keystroke10 keystroke20 keystroke30 keystroke40 keystroke50
## keystroke10         1.00        0.28        0.23        0.29        0.06
## keystroke20         0.28        1.00        0.47        0.37        0.40
## keystroke30         0.23        0.47        1.00        0.34        0.45
## keystroke40         0.29        0.37        0.34        1.00        0.48
## keystroke50         0.06        0.40        0.45        0.48        1.00
## keystroke60         0.03        0.43        0.37        0.30        0.58
## keystroke70         0.15        0.03        0.18        0.35        0.10
## keystroke80        -0.01        0.20        0.15        0.12        0.34
## keystroke90         0.08        0.13        0.06        0.12        0.17
## keystroke100       -0.05        0.09        0.06        0.02        0.03
## keystroke110       -0.02       -0.13        0.02       -0.04       -0.08
## keystroke120       -0.01        0.01       -0.13        0.00       -0.08
## keystroke130       -0.07       -0.08        0.04       -0.12       -0.04
## keystroke140       -0.10       -0.11       -0.17       -0.08       -0.14
## keystroke150        0.04       -0.13       -0.16       -0.06       -0.15
## keystroke160       -0.03       -0.09       -0.10       -0.13       -0.08
## keystroke170        0.00       -0.14       -0.18       -0.11       -0.12
## ouliers             0.00       -0.18       -0.20       -0.16       -0.14
## essay_sent          0.01       -0.04       -0.01       -0.02       -0.03
##              keystroke60 keystroke70 keystroke80 keystroke90 keystroke100
## keystroke10         0.03        0.15       -0.01        0.08        -0.05
## keystroke20         0.43        0.03        0.20        0.13         0.09
## keystroke30         0.37        0.18        0.15        0.06         0.06
## keystroke40         0.30        0.35        0.12        0.12         0.02
## keystroke50         0.58        0.10        0.34        0.17         0.03
## keystroke60         1.00        0.30        0.43        0.25         0.12
## keystroke70         0.30        1.00        0.21        0.22         0.13
## keystroke80         0.43        0.21        1.00        0.49         0.27
## keystroke90         0.25        0.22        0.49        1.00         0.41
## keystroke100        0.12        0.13        0.27        0.41         1.00
## keystroke110       -0.22        0.09        0.01        0.08         0.39
## keystroke120       -0.16       -0.14        0.09        0.26         0.33
## keystroke130       -0.10       -0.17       -0.04        0.05         0.31
## keystroke140       -0.22       -0.12       -0.21       -0.09         0.17
## keystroke150       -0.22       -0.07       -0.14       -0.01         0.02
## keystroke160       -0.15       -0.17       -0.04       -0.06         0.01
## keystroke170       -0.18       -0.08       -0.12       -0.01        -0.03
## ouliers            -0.18       -0.12       -0.08       -0.06        -0.02
## essay_sent         -0.04        0.00       -0.06       -0.04        -0.04
##              keystroke110 keystroke120 keystroke130 keystroke140 keystroke150
## keystroke10         -0.02        -0.01        -0.07        -0.10         0.04
## keystroke20         -0.13         0.01        -0.08        -0.11        -0.13
## keystroke30          0.02        -0.13         0.04        -0.17        -0.16
## keystroke40         -0.04         0.00        -0.12        -0.08        -0.06
## keystroke50         -0.08        -0.08        -0.04        -0.14        -0.15
## keystroke60         -0.22        -0.16        -0.10        -0.22        -0.22
## keystroke70          0.09        -0.14        -0.17        -0.12        -0.07
## keystroke80          0.01         0.09        -0.04        -0.21        -0.14
## keystroke90          0.08         0.26         0.05        -0.09        -0.01
## keystroke100         0.39         0.33         0.31         0.17         0.02
## keystroke110         1.00         0.35         0.32         0.35         0.23
## keystroke120         0.35         1.00         0.54         0.38         0.43
## keystroke130         0.32         0.54         1.00         0.46         0.48
## keystroke140         0.35         0.38         0.46         1.00         0.58
## keystroke150         0.23         0.43         0.48         0.58         1.00
## keystroke160         0.12         0.41         0.50         0.51         0.60
## keystroke170         0.07         0.30         0.32         0.47         0.61
## ouliers              0.07         0.28         0.35         0.48         0.58
## essay_sent           0.00        -0.04        -0.01         0.00        -0.02
##              keystroke160 keystroke170 ouliers essay_sent
## keystroke10         -0.03         0.00    0.00       0.01
## keystroke20         -0.09        -0.14   -0.18      -0.04
## keystroke30         -0.10        -0.18   -0.20      -0.01
## keystroke40         -0.13        -0.11   -0.16      -0.02
## keystroke50         -0.08        -0.12   -0.14      -0.03
## keystroke60         -0.15        -0.18   -0.18      -0.04
## keystroke70         -0.17        -0.08   -0.12       0.00
## keystroke80         -0.04        -0.12   -0.08      -0.06
## keystroke90         -0.06        -0.01   -0.06      -0.04
## keystroke100         0.01        -0.03   -0.02      -0.04
## keystroke110         0.12         0.07    0.07       0.00
## keystroke120         0.41         0.30    0.28      -0.04
## keystroke130         0.50         0.32    0.35      -0.01
## keystroke140         0.51         0.47    0.48       0.00
## keystroke150         0.60         0.61    0.58      -0.02
## keystroke160         1.00         0.61    0.75      -0.02
## keystroke170         0.61         1.00    0.78       0.00
## ouliers              0.75         0.78    1.00      -0.02
## essay_sent          -0.02         0.00   -0.02       1.00
#visualized
corrplot(corr_matrix, type = 'upper', order = "hclust", 
         tl.col = "black", tl.srt = 45)

We will add keystokes over 110 to outliers later because there is a strong correlation ### Plot Data Construction

val.vec <- c()
type.vec <- c()
sent.vec <- c()
theme.vec <- c()
for (i in 1:nrow(data.set)) {
  val <- max(data.set[i, 3:19])
  type <- data.set[i, 22]
  sent <- data.set[i, 21]
  theme <- data.set[i, 2]
  val.vec <- c(val.vec, val)
  type.vec <- c(type.vec, type)
  sent.vec <- c(sent.vec, sent)
  theme.vec <- c(theme.vec, theme)
}
plot.data <- data.frame(theme.vec, val.vec, type.vec, sent.vec)
colnames(plot.data) <- c('Theme', 'MaxKeystroke', "TypeofEssay", "Sentiment")

Graphs on Dependencies between Maximum Keystroke Value and its sentiment, grouped by theme

#Gay Marriage
ggplot(data=plot.data %>% filter(plot.data$Theme=='GayMerriage'), aes(MaxKeystroke, colour=TypeofEssay)) + geom_histogram() + ggtitle("Histogram of maximum keystroke over the essay on Gay Marriage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=plot.data %>% filter(plot.data$Theme=='GayMerriage'), aes(x = MaxKeystroke, y =Sentiment, colour=TypeofEssay)) + geom_point() + ggtitle("Dependency of maximum keystroke over the essay on Gay Marriage and its sentiment")

#Gun Control
ggplot(data=plot.data %>% filter(plot.data$Theme=='GunControl'), aes(MaxKeystroke, colour=TypeofEssay)) + geom_histogram() + ggtitle("Histogram of maximum keystroke over the essay on Gun Control")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=plot.data %>% filter(plot.data$Theme=='GunControl'), aes(x = MaxKeystroke, y =Sentiment, colour=TypeofEssay)) + geom_point() + ggtitle("Dependency of maximum keystroke over the essay on Gun Control and its sentiment")

#ReviewAMT
ggplot(data=plot.data %>% filter(plot.data$Theme=='ReviewAMT'), aes(MaxKeystroke, colour=TypeofEssay)) + geom_histogram() + ggtitle("Dependency of maximum keystroke over the essay on Review and its sentiment")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=plot.data %>% filter(plot.data$Theme=='ReviewAMT'), aes(x = MaxKeystroke, y =Sentiment, colour=TypeofEssay)) + geom_point() + ggtitle("Histogram of maximum keystroke over the essay on Review")

People writing about Gay Marriage or Gun Control try to keep neutural position compared to essays on Review of the AMT where sentiment is either very negative or very positive. Also we can say that huge keystroke almost never correspond to neutural opinion, so people take more time to write positive or negative thoughts.

Simple Linear Model over Sentiment and Keystrokes

model.1 <- lm(essay_sent ~ keystroke10 + keystroke20 + keystroke30 + keystroke30 + keystroke40 + 
                keystroke50 + keystroke60 + keystroke70 + keystroke80 + keystroke90 + keystroke100 + 
                keystroke110 + keystroke120 + keystroke130 + keystroke140 + keystroke150 + keystroke160
              + keystroke170 , data=data.set)
summary(model.1)
## 
## Call:
## lm(formula = essay_sent ~ keystroke10 + keystroke20 + keystroke30 + 
##     keystroke30 + keystroke40 + keystroke50 + keystroke60 + keystroke70 + 
##     keystroke80 + keystroke90 + keystroke100 + keystroke110 + 
##     keystroke120 + keystroke130 + keystroke140 + keystroke150 + 
##     keystroke160 + keystroke170, data = data.set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.730 -3.383  1.605  2.437  3.757 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.9150298  0.1152094  51.342   <2e-16 ***
## keystroke10   0.0024895  0.0023522   1.058    0.290    
## keystroke20  -0.0043525  0.0029372  -1.482    0.138    
## keystroke30   0.0006874  0.0042020   0.164    0.870    
## keystroke40  -0.0001009  0.0032481  -0.031    0.975    
## keystroke50  -0.0021723  0.0040944  -0.531    0.596    
## keystroke60  -0.0010607  0.0027784  -0.382    0.703    
## keystroke70   0.0013293  0.0026067   0.510    0.610    
## keystroke80  -0.0054675  0.0028725  -1.903    0.057 .  
## keystroke90  -0.0012953  0.0034029  -0.381    0.703    
## keystroke100 -0.0034422  0.0026537  -1.297    0.195    
## keystroke110  0.0024434  0.0035609   0.686    0.493    
## keystroke120 -0.0072593  0.0049708  -1.460    0.144    
## keystroke130  0.0072175  0.0066466   1.086    0.278    
## keystroke140  0.0029920  0.0052392   0.571    0.568    
## keystroke150 -0.0192753  0.0129616  -1.487    0.137    
## keystroke160 -0.0146290  0.0127138  -1.151    0.250    
## keystroke170  0.0156815  0.0199026   0.788    0.431    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.682 on 4782 degrees of freedom
## Multiple R-squared:  0.007395,   Adjusted R-squared:  0.003866 
## F-statistic: 2.096 on 17 and 4782 DF,  p-value: 0.005268
ggplot(data=data.set, aes(x = keystroke10 + keystroke20 + keystroke30 + keystroke30 + keystroke40 + 
                keystroke50 + keystroke60 + keystroke70 + keystroke80 + keystroke90 + keystroke100 + 
                keystroke110 + keystroke120 + keystroke130 + keystroke140 + keystroke150 + keystroke160
              + keystroke170,  y= essay_sent)) + geom_point() + geom_smooth(method = 'lm')
## `geom_smooth()` using formula 'y ~ x'

model_pred <- predict(model.1)
model.res <- data.set$essay_sent - model_pred
plot(model_pred, model.res)
abline(h=0)

Fighting the problem of too many variables, because they become insignificant. Let`s sum over keystroke

SUm over keystrokes from 10 to 90

vec <- c()
for (i in 1:nrow(data.set)) {
  vec <- c(vec, sum(data.set[i, 3:10]))
}
data.set$sum.10.80 <- vec 

Filtering for treshold

ind <- 90
for (i in 11:14) {
  ind.str <- paste(toString(ind),"bin",sep="")
  treshold <- mean(data.set[, i]) - 0.4*mean(data.set[, i])
  data.set[, i] <- ifelse(data.set[, i] >= treshold, 1, 0)
  ind <- ind + 10
}

### adding keystroke >= 120 to outliers
for ( i in 1:nrow(data.set)) {
  data.set[i, 20] <- data.set[i, 20] + sum(data.set[i, 14:19])
}
model.2 <- lm(essay_sent ~ sum.10.80 + keystroke90 + keystroke100 + 
                keystroke110 + ouliers, data=data.set)
summary(model.2)
## 
## Call:
## lm(formula = essay_sent ~ sum.10.80 + keystroke90 + keystroke100 + 
##     keystroke110 + ouliers, data = data.set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.575 -3.403  1.626  2.445  3.567 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.9614775  0.1308253  45.568  < 2e-16 ***
## sum.10.80    -0.0014267  0.0005039  -2.831  0.00465 ** 
## keystroke90  -0.2407210  0.0868967  -2.770  0.00562 ** 
## keystroke100  0.0811285  0.0959841   0.845  0.39803    
## keystroke110 -0.0044195  0.0912404  -0.048  0.96137    
## ouliers      -0.0025744  0.0011270  -2.284  0.02240 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.683 on 4794 degrees of freedom
## Multiple R-squared:  0.00414,    Adjusted R-squared:  0.003101 
## F-statistic: 3.986 on 5 and 4794 DF,  p-value: 0.001309

Poisson Model

library(rsq)
library(ggplot2)
poisson.model <- glm(essay_sent ~ sum.10.80 + keystroke90 + keystroke100 + 
                keystroke110 + ouliers + factor(name), data=data.set, family = poisson(link = "log"))
summary(poisson.model)
## 
## Call:
## glm(formula = essay_sent ~ sum.10.80 + keystroke90 + keystroke100 + 
##     keystroke110 + ouliers + factor(name), family = poisson(link = "log"), 
##     data = data.set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4899  -1.3529   0.2351   0.5320   1.5275  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             1.514e+00  2.404e-02  62.972   <2e-16 ***
## sum.10.80               1.542e-04  8.233e-05   1.873   0.0611 .  
## keystroke90             4.103e-03  1.371e-02   0.299   0.7648    
## keystroke100            2.424e-02  1.524e-02   1.591   0.1117    
## keystroke110           -1.347e-02  1.449e-02  -0.929   0.3526    
## ouliers                 2.434e-04  1.826e-04   1.333   0.1826    
## factor(name)GunControl -3.798e-02  1.637e-02  -2.320   0.0203 *  
## factor(name)ReviewAMT   4.263e-01  1.497e-02  28.465   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 7072.0  on 4799  degrees of freedom
## Residual deviance: 5832.6  on 4792  degrees of freedom
## AIC: 22309
## 
## Number of Fisher Scoring iterations: 4
rsq(poisson.model,adj=TRUE)
## [1] 0.2046907
plot(predict(poisson.model), data.set$essay_sent, xlab="predicted",ylab="actual")
 abline(a=0,b=1)

Logistic regression for sentiment

data.set$essay_sentbin <- ifelse(data.set$essay_sent >= 5, 1, 0)
logistic.model <- glm(essay_sentbin ~ sum.10.80 + keystroke90 + keystroke100 + 
                keystroke110 + ouliers , data=data.set, family = "binomial")
summary(logistic.model)
## 
## Call:
## glm(formula = essay_sentbin ~ sum.10.80 + keystroke90 + keystroke100 + 
##     keystroke110 + ouliers, family = "binomial", data = data.set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4547  -1.3236   0.9754   1.0280   1.3731  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.6611704  0.0992363   6.663 2.69e-11 ***
## sum.10.80    -0.0008856  0.0003794  -2.334   0.0196 *  
## keystroke90  -0.1674460  0.0663682  -2.523   0.0116 *  
## keystroke100  0.0649890  0.0728704   0.892   0.3725    
## keystroke110 -0.0213246  0.0693270  -0.308   0.7584    
## ouliers      -0.0019778  0.0008508  -2.325   0.0201 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6494.9  on 4799  degrees of freedom
## Residual deviance: 6478.7  on 4794  degrees of freedom
## AIC: 6490.7
## 
## Number of Fisher Scoring iterations: 4
plot(model_pred, data.set$essay_sentbin)

model_pred <- predict(logistic.model)
model.res <- data.set$essay_sentbin - model_pred
plot(model_pred, model.res)
abline(h=0)

Logistic regression for fake/true prediction

logistic.model.type <- glm(essay_typebin ~ sum.10.80 + ouliers , data=data.set, family = "binomial")
summary(logistic.model.type)
## 
## Call:
## glm(formula = essay_typebin ~ sum.10.80 + ouliers, family = "binomial", 
##     data = data.set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8028  -1.1571  -0.2197   1.1890   1.3287  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.4230621  0.0806654  -5.245 1.57e-07 ***
## sum.10.80    0.0021525  0.0003881   5.547 2.91e-08 ***
## ouliers      0.0022115  0.0008285   2.669   0.0076 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6654.2  on 4799  degrees of freedom
## Residual deviance: 6620.4  on 4797  degrees of freedom
## AIC: 6626.4
## 
## Number of Fisher Scoring iterations: 4

Scaling the data

data.set.scaled <- data.set %>% mutate_at(c("keystroke10", "keystroke20", "keystroke30", "keystroke40", "keystroke50", "keystroke60", "keystroke70", "keystroke80", "keystroke90", "keystroke100", "keystroke110", "keystroke120", "keystroke130", "keystroke140", "keystroke150", "keystroke160", "keystroke170"), ~(scale(.) %>% as.vector))

Doesn`t work for us, because we get negative values for keystrokes

Ordinal Logistic Regression

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
summary(data.set)
##        id           name            keystroke10      keystroke20    
##  Min.   :   0   Length:4800        Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:1200   Class :character   1st Qu.:  2.00   1st Qu.: 10.00  
##  Median :2400   Mode  :character   Median : 11.00   Median : 17.00  
##  Mean   :2400                      Mean   : 15.49   Mean   : 20.97  
##  3rd Qu.:3599                      3rd Qu.: 22.00   3rd Qu.: 28.00  
##  Max.   :4799                      Max.   :233.00   Max.   :187.00  
##   keystroke30      keystroke40      keystroke50      keystroke60    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  8.00   1st Qu.:  8.00   1st Qu.:  7.00   1st Qu.: 14.00  
##  Median : 13.00   Median : 14.00   Median : 13.00   Median : 23.00  
##  Mean   : 15.21   Mean   : 16.82   Mean   : 15.54   Mean   : 28.07  
##  3rd Qu.: 19.25   3rd Qu.: 21.00   3rd Qu.: 20.00   3rd Qu.: 37.00  
##  Max.   :159.00   Max.   :299.00   Max.   :218.00   Max.   :229.00  
##   keystroke70      keystroke80      keystroke90      keystroke100   
##  Min.   :  0.00   Min.   :  0.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 11.00   1st Qu.: 14.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 19.00   Median : 22.00   Median :1.0000   Median :1.0000  
##  Mean   : 22.34   Mean   : 25.64   Mean   :0.6723   Mean   :0.7192  
##  3rd Qu.: 29.00   3rd Qu.: 33.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :219.00   Max.   :172.00   Max.   :1.0000   Max.   :1.0000  
##   keystroke110     keystroke120     keystroke130      keystroke140   
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  2.000   1st Qu.: 2.000  
##  Median :1.0000   Median :1.0000   Median :  5.000   Median : 5.000  
##  Mean   :0.6523   Mean   :0.5723   Mean   :  7.471   Mean   : 8.732  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 11.000   3rd Qu.:12.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :152.000   Max.   :97.000  
##   keystroke150     keystroke160     keystroke170       ouliers      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 11.00  
##  Median : 2.000   Median : 1.000   Median : 1.000   Median : 24.00  
##  Mean   : 3.268   Mean   : 3.002   Mean   : 1.625   Mean   : 35.95  
##  3rd Qu.: 4.000   3rd Qu.: 4.000   3rd Qu.: 2.000   3rd Qu.: 47.00  
##  Max.   :43.000   Max.   :46.000   Max.   :31.000   Max.   :368.00  
##    essay_sent     essay_type        essay_typebin   sum.10.80    
##  Min.   :1.000   Length:4800        Min.   :0.0   Min.   :  4.0  
##  1st Qu.:2.000   Class :character   1st Qu.:0.0   1st Qu.:111.0  
##  Median :7.000   Mode  :character   Median :0.5   Median :148.0  
##  Mean   :5.534                      Mean   :0.5   Mean   :160.1  
##  3rd Qu.:8.000                      3rd Qu.:1.0   3rd Qu.:190.0  
##  Max.   :8.000                      Max.   :1.0   Max.   :935.0  
##  essay_sentbin   
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :1.0000  
##  Mean   :0.5908  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
data.set$essay_sent3 <- ifelse(data.set$essay_sent >= 6, "positive", ifelse(data.set$essay_sent <4, "negative", "neutural"))

model.polr <- polr(factor(essay_sent3)~essay_typebin + keystroke90 + keystroke100 + keystroke110 + keystroke120 + factor(name), data = data.set, Hess = TRUE)
summary(model.polr)
## Call:
## polr(formula = factor(essay_sent3) ~ essay_typebin + keystroke90 + 
##     keystroke100 + keystroke110 + keystroke120 + factor(name), 
##     data = data.set, Hess = TRUE)
## 
## Coefficients:
##                           Value Std. Error t value
## essay_typebin          -0.10718    0.06016 -1.7815
## keystroke90             0.05783    0.06959  0.8311
## keystroke100            0.07985    0.07441  1.0731
## keystroke110           -0.02684    0.07235 -0.3710
## keystroke120           -0.05841    0.06891 -0.8476
## factor(name)GunControl -0.09055    0.06548 -1.3828
## factor(name)ReviewAMT   2.29116    0.09020 25.4009
## 
## Intercepts:
##                   Value   Std. Error t value
## negative|neutural -0.3488  0.0864    -4.0364
## neutural|positive  0.3790  0.0865     4.3792
## 
## Residual Deviance: 8256.865 
## AIC: 8274.865
(ctable <- coef(summary(model.polr)))
##                              Value Std. Error    t value
## essay_typebin          -0.10718102 0.06016445 -1.7814676
## keystroke90             0.05783411 0.06958906  0.8310804
## keystroke100            0.07984790 0.07440792  1.0731102
## keystroke110           -0.02684282 0.07234750 -0.3710263
## keystroke120           -0.05840766 0.06891146 -0.8475754
## factor(name)GunControl -0.09054923 0.06548123 -1.3828272
## factor(name)ReviewAMT   2.29116469 0.09020027 25.4008641
## negative|neutural      -0.34878002 0.08640819 -4.0364231
## neutural|positive       0.37895330 0.08653444  4.3792195
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
(ctable <- cbind(ctable, "p value" = p))
##                              Value Std. Error    t value       p value
## essay_typebin          -0.10718102 0.06016445 -1.7814676  7.483609e-02
## keystroke90             0.05783411 0.06958906  0.8310804  4.059282e-01
## keystroke100            0.07984790 0.07440792  1.0731102  2.832217e-01
## keystroke110           -0.02684282 0.07234750 -0.3710263  7.106180e-01
## keystroke120           -0.05840766 0.06891146 -0.8475754  3.966745e-01
## factor(name)GunControl -0.09054923 0.06548123 -1.3828272  1.667179e-01
## factor(name)ReviewAMT   2.29116469 0.09020027 25.4008641 2.467032e-142
## negative|neutural      -0.34878002 0.08640819 -4.0364231  5.427231e-05
## neutural|positive       0.37895330 0.08653444  4.3792195  1.191051e-05

Filtering the data

Here i deleted every observation where number of keystrokes was less than on 0.95 conf interval

library(dplyr)

data.set.filtered <- data.set %>% filter(data.set$keystroke10 <= quantile(data.set$keystroke10, c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke20 <= quantile(data.set.filtered$keystroke20,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke30 <= quantile(data.set.filtered$keystroke30,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke40 <= quantile(data.set.filtered$keystroke40,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke50 <= quantile(data.set.filtered$keystroke50,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke60 <= quantile(data.set.filtered$keystroke60,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke70 <= quantile(data.set.filtered$keystroke70,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke80 <= quantile(data.set.filtered$keystroke80,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke90 <= quantile(data.set.filtered$keystroke90,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke100 <= quantile(data.set.filtered$keystroke100,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke110 <= quantile(data.set.filtered$keystroke110,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke120 <= quantile(data.set.filtered$keystroke120,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke130 <= quantile(data.set.filtered$keystroke130,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke140 <= quantile(data.set.filtered$keystroke140,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke150 <= quantile(data.set.filtered$keystroke150,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke160 <= quantile(data.set.filtered$keystroke160,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke170 <= quantile(data.set.filtered$keystroke170,c(.95)))
p.10 <- ggplot(data=data.set.filtered, aes(keystroke10, colour=name)) + 
  geom_histogram(fill="white")
p.10
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.20 <- ggplot(data=data.set.filtered, aes(keystroke20, colour=name)) + 
  geom_histogram(fill="white")
p.20
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.30 <- ggplot(data=data.set.filtered, aes(keystroke30, colour=name)) + 
  geom_histogram(fill="white")
p.30
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.40 <- ggplot(data=data.set.filtered, aes(keystroke40, colour=name)) + 
  geom_histogram(fill="white")
p.40
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.50 <- ggplot(data=data.set.filtered, aes(keystroke50, colour=name)) + 
  geom_histogram(fill="white")
p.50
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.60 <- ggplot(data=data.set.filtered, aes(keystroke60, colour=name)) + 
  geom_histogram(fill="white")
p.60
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.70 <- ggplot(data=data.set.filtered, aes(keystroke70, colour=name)) + 
  geom_histogram(fill="white")
p.70
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.80 <- ggplot(data=data.set.filtered, aes(keystroke80, colour=name)) + 
  geom_histogram(fill="white")
p.80
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p.90 <- ggplot(data=data.set.filtered, aes(keystroke90, colour=name)) + 
  geom_histogram(fill="white")
p.90
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Simple Linear Model over Sentiment and Keystrokes

model.1 <- lm(essay_sent ~ keystroke10 + keystroke20 + keystroke30 + keystroke30 + keystroke40 + 
                keystroke50 + keystroke60 + keystroke70 + keystroke80 + keystroke90 + keystroke100 + 
                keystroke110 + keystroke120 + keystroke130 + keystroke140 + keystroke150 + keystroke160
              + keystroke170 , data=data.set.filtered)
summary(model.1)
## 
## Call:
## lm(formula = essay_sent ~ keystroke10 + keystroke20 + keystroke30 + 
##     keystroke30 + keystroke40 + keystroke50 + keystroke60 + keystroke70 + 
##     keystroke80 + keystroke90 + keystroke100 + keystroke110 + 
##     keystroke120 + keystroke130 + keystroke140 + keystroke150 + 
##     keystroke160 + keystroke170, data = data.set.filtered)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.922 -3.325  1.723  2.367  3.224 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.292198   0.277062  22.710   <2e-16 ***
## keystroke10  -0.002537   0.005667  -0.448   0.6544    
## keystroke20   0.008063   0.006945   1.161   0.2458    
## keystroke30   0.005518   0.010600   0.521   0.6027    
## keystroke40  -0.014789   0.009444  -1.566   0.1175    
## keystroke50  -0.005756   0.011659  -0.494   0.6216    
## keystroke60  -0.007546   0.006422  -1.175   0.2401    
## keystroke70   0.006080   0.007171   0.848   0.3966    
## keystroke80  -0.012943   0.006103  -2.121   0.0340 *  
## keystroke90  -0.077262   0.150457  -0.514   0.6076    
## keystroke100  0.058055   0.150124   0.387   0.6990    
## keystroke110 -0.115113   0.141491  -0.814   0.4160    
## keystroke120 -0.184520   0.137898  -1.338   0.1810    
## keystroke130 -0.008764   0.013688  -0.640   0.5220    
## keystroke140  0.025326   0.013683   1.851   0.0643 .  
## keystroke150 -0.012435   0.029461  -0.422   0.6730    
## keystroke160 -0.012792   0.036552  -0.350   0.7264    
## keystroke170 -0.064968   0.055886  -1.163   0.2451    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.68 on 2575 degrees of freedom
## Multiple R-squared:  0.01174,    Adjusted R-squared:  0.005215 
## F-statistic: 1.799 on 17 and 2575 DF,  p-value: 0.02299
ggplot(data=data.set.filtered, aes(x = keystroke10 + keystroke20 + keystroke30 + keystroke30 + keystroke40 + 
                keystroke50 + keystroke60 + keystroke70 + keystroke80 + keystroke90 + keystroke100 + 
                keystroke110 + keystroke120 + keystroke130 + keystroke140 + keystroke150 + keystroke160
              + keystroke170,  y= essay_sent)) + geom_point() + geom_smooth(method = 'lm')
## `geom_smooth()` using formula 'y ~ x'

model_pred <- predict(model.1)
model.res <- data.set.filtered$essay_sent - model_pred
plot(model_pred, model.res)
abline(h=0)

Fighting the problem of too many variables, because they become insignificant. Let`s sum over keystroke

SUm over keystrokes from 10 to 90

vec <- c()
for (i in 1:nrow(data.set.filtered)) {
  vec <- c(vec, sum(data.set.filtered[i, 3:10]))
}
data.set.filtered$sum.10.80 <- vec 

Filtering for treshold

ind <- 90
for (i in 11:14) {
  ind.str <- paste(toString(ind),"bin",sep="")
  treshold <- mean(data.set.filtered[, i]) - 0.4*mean(data.set.filtered[, i])
  data.set.filtered[, i] <- ifelse(data.set.filtered[, i] >= treshold, 1, 0)
  ind <- ind + 10
}

### adding keystroke >= 120 to outliers
for ( i in 1:nrow(data.set.filtered)) {
  data.set.filtered[i, 20] <- data.set.filtered[i, 20] + sum(data.set.filtered[i, 14:19])
}

Poisson Model

library(rsq)
library(ggplot2)
poisson.model <- glm(essay_sent ~ sum.10.80 + keystroke90 + keystroke100 + 
                keystroke110 + ouliers + factor(name), data=data.set.filtered, family = poisson(link = "log"))
summary(poisson.model)
## 
## Call:
## glm(formula = essay_sent ~ sum.10.80 + keystroke90 + keystroke100 + 
##     keystroke110 + ouliers + factor(name), family = poisson(link = "log"), 
##     data = data.set.filtered)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3980  -1.3355   0.2531   0.4077   1.5595  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             1.5799853  0.0477884  33.062  < 2e-16 ***
## sum.10.80               0.0001009  0.0002306   0.438  0.66161    
## keystroke90            -0.0173143  0.0209522  -0.826  0.40859    
## keystroke100            0.0300668  0.0225342   1.334  0.18211    
## keystroke110           -0.0252806  0.0203543  -1.242  0.21423    
## ouliers                -0.0001464  0.0003287  -0.445  0.65606    
## factor(name)GunControl -0.0656833  0.0228398  -2.876  0.00403 ** 
## factor(name)ReviewAMT   0.3936910  0.0197869  19.897  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3796.5  on 2592  degrees of freedom
## Residual deviance: 3142.9  on 2585  degrees of freedom
## AIC: 12098
## 
## Number of Fisher Scoring iterations: 4
rsq(poisson.model,adj=TRUE)
## [1] 0.1996001
plot(predict(poisson.model), data.set.filtered$essay_sent, xlab="predicted",ylab="actual")
 abline(a=0,b=1)

Logistic regression for sentiment

data.set.filtered$essay_sentbin <- ifelse(data.set.filtered$essay_sent >= 5, 1, 0)
logistic.model <- glm(essay_sentbin ~ sum.10.80 + keystroke90 + keystroke100 + 
                keystroke110 + ouliers , data=data.set.filtered, family = "binomial")
summary(logistic.model)
## 
## Call:
## glm(formula = essay_sentbin ~ sum.10.80 + keystroke90 + keystroke100 + 
##     keystroke110 + ouliers, family = "binomial", data = data.set.filtered)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5746  -1.3431   0.9539   1.0117   1.1442  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.9497085  0.2156885   4.403 1.07e-05 ***
## sum.10.80    -0.0019892  0.0010958  -1.815   0.0695 .  
## keystroke90  -0.1772353  0.1048172  -1.691   0.0909 .  
## keystroke100  0.0128717  0.1098422   0.117   0.9067    
## keystroke110 -0.1156146  0.0993287  -1.164   0.2444    
## ouliers      -0.0008932  0.0015828  -0.564   0.5726    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3477.5  on 2592  degrees of freedom
## Residual deviance: 3469.1  on 2587  degrees of freedom
## AIC: 3481.1
## 
## Number of Fisher Scoring iterations: 4

Logistic regression for fake/true prediction

logistic.model.type <- glm(essay_typebin ~ sum.10.80 + ouliers , data=data.set.filtered, family = "binomial")
summary(logistic.model.type)
## 
## Call:
## glm(formula = essay_typebin ~ sum.10.80 + ouliers, family = "binomial", 
##     data = data.set.filtered)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.277  -1.144  -1.098   1.209   1.318  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.396022   0.182820  -2.166   0.0303 *
## sum.10.80    0.001930   0.001046   1.845   0.0650 .
## ouliers      0.001343   0.001469   0.914   0.3607  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3591.0  on 2592  degrees of freedom
## Residual deviance: 3587.6  on 2590  degrees of freedom
## AIC: 3593.6
## 
## Number of Fisher Scoring iterations: 3
rsq(logistic.model.type)
## [1] 0.00130308

Ordinal Logistic Regression

library(MASS)
summary(data.set)
##        id           name            keystroke10      keystroke20    
##  Min.   :   0   Length:4800        Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:1200   Class :character   1st Qu.:  2.00   1st Qu.: 10.00  
##  Median :2400   Mode  :character   Median : 11.00   Median : 17.00  
##  Mean   :2400                      Mean   : 15.49   Mean   : 20.97  
##  3rd Qu.:3599                      3rd Qu.: 22.00   3rd Qu.: 28.00  
##  Max.   :4799                      Max.   :233.00   Max.   :187.00  
##   keystroke30      keystroke40      keystroke50      keystroke60    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  8.00   1st Qu.:  8.00   1st Qu.:  7.00   1st Qu.: 14.00  
##  Median : 13.00   Median : 14.00   Median : 13.00   Median : 23.00  
##  Mean   : 15.21   Mean   : 16.82   Mean   : 15.54   Mean   : 28.07  
##  3rd Qu.: 19.25   3rd Qu.: 21.00   3rd Qu.: 20.00   3rd Qu.: 37.00  
##  Max.   :159.00   Max.   :299.00   Max.   :218.00   Max.   :229.00  
##   keystroke70      keystroke80      keystroke90      keystroke100   
##  Min.   :  0.00   Min.   :  0.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 11.00   1st Qu.: 14.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 19.00   Median : 22.00   Median :1.0000   Median :1.0000  
##  Mean   : 22.34   Mean   : 25.64   Mean   :0.6723   Mean   :0.7192  
##  3rd Qu.: 29.00   3rd Qu.: 33.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :219.00   Max.   :172.00   Max.   :1.0000   Max.   :1.0000  
##   keystroke110     keystroke120     keystroke130      keystroke140   
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  2.000   1st Qu.: 2.000  
##  Median :1.0000   Median :1.0000   Median :  5.000   Median : 5.000  
##  Mean   :0.6523   Mean   :0.5723   Mean   :  7.471   Mean   : 8.732  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 11.000   3rd Qu.:12.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :152.000   Max.   :97.000  
##   keystroke150     keystroke160     keystroke170       ouliers      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 11.00  
##  Median : 2.000   Median : 1.000   Median : 1.000   Median : 24.00  
##  Mean   : 3.268   Mean   : 3.002   Mean   : 1.625   Mean   : 35.95  
##  3rd Qu.: 4.000   3rd Qu.: 4.000   3rd Qu.: 2.000   3rd Qu.: 47.00  
##  Max.   :43.000   Max.   :46.000   Max.   :31.000   Max.   :368.00  
##    essay_sent     essay_type        essay_typebin   sum.10.80    
##  Min.   :1.000   Length:4800        Min.   :0.0   Min.   :  4.0  
##  1st Qu.:2.000   Class :character   1st Qu.:0.0   1st Qu.:111.0  
##  Median :7.000   Mode  :character   Median :0.5   Median :148.0  
##  Mean   :5.534                      Mean   :0.5   Mean   :160.1  
##  3rd Qu.:8.000                      3rd Qu.:1.0   3rd Qu.:190.0  
##  Max.   :8.000                      Max.   :1.0   Max.   :935.0  
##  essay_sentbin    essay_sent3       
##  Min.   :0.0000   Length:4800       
##  1st Qu.:0.0000   Class :character  
##  Median :1.0000   Mode  :character  
##  Mean   :0.5908                     
##  3rd Qu.:1.0000                     
##  Max.   :1.0000
data.set.filtered$essay_sent3 <- ifelse(data.set.filtered$essay_sent >= 6, "positive", ifelse(data.set.filtered$essay_sent <4, "negative", "neutural"))

model.polr <- polr(factor(essay_sent3)~essay_typebin + keystroke90 + keystroke100 + keystroke110 + keystroke120 + factor(name), data = data.set.filtered, Hess = TRUE)
summary(model.polr)
## Call:
## polr(formula = factor(essay_sent3) ~ essay_typebin + keystroke90 + 
##     keystroke100 + keystroke110 + keystroke120 + factor(name), 
##     data = data.set.filtered, Hess = TRUE)
## 
## Coefficients:
##                           Value Std. Error t value
## essay_typebin          -0.05978    0.08277 -0.7223
## keystroke90            -0.05280    0.11080 -0.4765
## keystroke100            0.12824    0.11265  1.1384
## keystroke110           -0.05517    0.10253 -0.5380
## keystroke120           -0.14468    0.09808 -1.4752
## factor(name)GunControl -0.16176    0.09179 -1.7623
## factor(name)ReviewAMT   2.10370    0.11573 18.1782
## 
## Intercepts:
##                   Value   Std. Error t value
## negative|neutural -0.4673  0.1356    -3.4459
## neutural|positive  0.1961  0.1354     1.4486
## 
## Residual Deviance: 4361.228 
## AIC: 4379.228
(ctable <- coef(summary(model.polr)))
##                              Value Std. Error    t value
## essay_typebin          -0.05978167 0.08276899 -0.7222713
## keystroke90            -0.05279813 0.11080418 -0.4764994
## keystroke100            0.12824405 0.11265257  1.1384032
## keystroke110           -0.05516576 0.10252940 -0.5380482
## keystroke120           -0.14468449 0.09807725 -1.4752095
## factor(name)GunControl -0.16175591 0.09178606 -1.7623145
## factor(name)ReviewAMT   2.10369794 0.11572673 18.1781509
## negative|neutural      -0.46726885 0.13560294 -3.4458608
## neutural|positive       0.19607668 0.13535562  1.4486039
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
(ctable <- cbind(ctable, "p value" = p))
##                              Value Std. Error    t value      p value
## essay_typebin          -0.05978167 0.08276899 -0.7222713 4.701277e-01
## keystroke90            -0.05279813 0.11080418 -0.4764994 6.337186e-01
## keystroke100            0.12824405 0.11265257  1.1384032 2.549522e-01
## keystroke110           -0.05516576 0.10252940 -0.5380482 5.905438e-01
## keystroke120           -0.14468449 0.09807725 -1.4752095 1.401562e-01
## factor(name)GunControl -0.16175591 0.09178606 -1.7623145 7.801617e-02
## factor(name)ReviewAMT   2.10369794 0.11572673 18.1781509 7.688278e-74
## negative|neutural      -0.46726885 0.13560294 -3.4458608 5.692435e-04
## neutural|positive       0.19607668 0.13535562  1.4486039 1.474482e-01

Random Forest for variables selection

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
randForest.selection <- randomForest(factor(data.set.filtered$essay_sent3)~., data= data.set.filtered[, c(3:20)])
importance(randForest.selection)
##              MeanDecreaseGini
## keystroke10         114.28441
## keystroke20         121.02728
## keystroke30         107.11058
## keystroke40         111.68764
## keystroke50         109.70164
## keystroke60         125.81134
## keystroke70         120.45692
## keystroke80         121.69747
## keystroke90          13.19775
## keystroke100         16.15979
## keystroke110         17.85811
## keystroke120         16.02276
## keystroke130         91.52549
## keystroke140         91.89723
## keystroke150         64.13282
## keystroke160         59.86850
## keystroke170         44.12682
## ouliers             125.33648